import numpy as np
import pandas as pd
#Data Reader from Internet
#pip install pandas-datareader
import matplotlib.pyplot as plt
import seaborn as sns
#visualization inside Jupyter Notebook
%matplotlib inline
#display image in Jupyter Notebook
from IPython.display import Image
###################
# Interactive plots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# get version
from plotly import __version__
#print(__version__)
import cufflinks as cf
# For Notebooks
init_notebook_mode(connected=True)
# For offline use
cf.go_offline()
#[[Plotly "after May 2020"
# + pip install chart-studio
#import chart-studio.plotly as py
###################
#Machine Learning (pip install scikit-learn)
#generic import pattern: from sklearn.<model_family> import <TheModel>
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.metrics import confusion_matrix
#from sklearn.datasets import load_boston #deprecated
# BUT other available (load_breast_cancer, l....)
# MatPlotLib rearrange display
# Tighten spacing so labels/titles don't overlap.
# NOTE(review): `fig` is not defined above — assumes a Figure created earlier.
fig.tight_layout()
# or the pyplot equivalent, acting on the current figure:
plt.tight_layout()
# SeaBorn Load dataset (demo dataset shipped with seaborn)
tips = sns.load_dataset('tips')
train = pd.read_csv('datas/titanic_train.csv') #index_col=0)
train # 891 rows x 12 columns
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q |
891 rows × 11 columns
# Dtypes and non-null counts per column (reveals missing Age/Cabin/Embarked).
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
# Summary statistics (count/mean/std/quartiles) for the numeric columns only.
train.describe()
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
##################
# show null datas
##################
# Heatmap of train.isnull(): each bright cell marks a missing value, making
# sparse columns (Age, Cabin) easy to spot at a glance.
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')
<AxesSubplot: >
####################################
# drop a column
####################################
# Remove 'Cabin' entirely — too many nulls (only 204/891 present) to be useful.
train.drop('Cabin',axis=1 #axis=0 for rows, axis=1 for columns
,inplace=True) #inplace to apply on data train (not a copy)
train # 891 x 11 (one column fewer)
####################################
# fill missing values (here Age replaced by mean age in Pclass)
####################################
def impute_age(cols):
    """Impute a missing Age from the passenger's class.

    Parameters
    ----------
    cols : two-element row slice ``(Age, Pclass)`` — works for a plain
        sequence or a pandas Series row from ``apply(..., axis=1)``.

    Returns
    -------
    The original Age when present, otherwise the (rounded) mean age
    observed for that Pclass: 37 / 29 / 24 for classes 1 / 2 / 3.
    """
    # Unpack positionally: avoids the deprecated positional Series[0]
    # lookup (raises in recent pandas when the index is label-based).
    Age, Pclass = cols
    if pd.isnull(Age):
        # Fall back to the class-wise mean age precomputed from the data.
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age
# Apply row-wise (axis=1): each row's (Age, Pclass) pair is fed to impute_age.
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)
train
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q |
891 rows × 11 columns
####################################
# drop null values (DROPS THE ROW containing null values)
####################################
# After Age imputation only Embarked still has nulls (2 rows) -> 889 rows left.
train.dropna(inplace=True)
train # 889 x 11
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q |
889 rows × 11 columns
# NOTE(review): 'male' is the dummy column created by pd.get_dummies(train['Sex'])
# in a later cell — this drop only works after that column exists (out-of-order
# notebook execution); otherwise it raises a KeyError.
train.drop('male',axis=1 #axis=0 for rows, axis=1 for columns
,inplace=True)
train
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q |
889 rows × 11 columns
####################################
# replace non numeric values (short)
####################################
loans = pd.read_csv('datas/loan_data.csv')
#cat_feats = ['purpose']
# One-hot encode the categorical 'purpose' column: each distinct value becomes
# its own 0/1 indicator column (add drop_first=True to avoid collinearity).
final_data = pd.get_dummies(loans,columns=['purpose']) # ,drop_first=True
# distinct categories that were expanded:
loans['purpose'].unique()
array(['debt_consolidation', 'credit_card', 'all_other',
'home_improvement', 'small_business', 'major_purchase',
'educational'], dtype=object)
# After get_dummies: 'purpose' has been replaced by 7 uint8 indicator columns.
final_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 credit.policy 9578 non-null int64 1 int.rate 9578 non-null float64 2 installment 9578 non-null float64 3 log.annual.inc 9578 non-null float64 4 dti 9578 non-null float64 5 fico 9578 non-null int64 6 days.with.cr.line 9578 non-null float64 7 revol.bal 9578 non-null int64 8 revol.util 9578 non-null float64 9 inq.last.6mths 9578 non-null int64 10 delinq.2yrs 9578 non-null int64 11 pub.rec 9578 non-null int64 12 not.fully.paid 9578 non-null int64 13 purpose_all_other 9578 non-null uint8 14 purpose_credit_card 9578 non-null uint8 15 purpose_debt_consolidation 9578 non-null uint8 16 purpose_educational 9578 non-null uint8 17 purpose_home_improvement 9578 non-null uint8 18 purpose_major_purchase 9578 non-null uint8 19 purpose_small_business 9578 non-null uint8 dtypes: float64(6), int64(7), uint8(7) memory usage: 1.0 MB
# Original frame for comparison: 'purpose' is still a single object column.
loans.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 credit.policy 9578 non-null int64 1 purpose 9578 non-null object 2 int.rate 9578 non-null float64 3 installment 9578 non-null float64 4 log.annual.inc 9578 non-null float64 5 dti 9578 non-null float64 6 fico 9578 non-null int64 7 days.with.cr.line 9578 non-null float64 8 revol.bal 9578 non-null int64 9 revol.util 9578 non-null float64 10 inq.last.6mths 9578 non-null int64 11 delinq.2yrs 9578 non-null int64 12 pub.rec 9578 non-null int64 13 not.fully.paid 9578 non-null int64 dtypes: float64(6), int64(7), object(1) memory usage: 1.0+ MB
####################################
# replace non numeric values
####################################
pd.get_dummies(train['Sex']) # returns a frame: columns = distinct values, cells = 1/0 indicators
# Sex is binary (male/female), so one column fully determines the other:
# drop the first to avoid redundancy/collinearity.
pd.get_dummies(train['Sex'],drop_first=True)
# keep the result
sex = pd.get_dummies(train['Sex'],drop_first=True)
# append the indicator column(s) to the original frame
train = pd.concat([train,sex],axis=1)
train
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | 1 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | 0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | 0 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | 0 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | S | 1 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | S | 0 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 24.0 | 1 | 2 | W./C. 6607 | 23.4500 | S | 0 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C | 1 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | Q | 1 |
889 rows × 12 columns
train['Embarked'].unique() #distinct values present in a column
array(['S', 'C', 'Q'], dtype=object)
# 3 categories -> 2 indicator columns (Q, S); 'C' is the dropped baseline
pd.get_dummies(train['Embarked'],drop_first=True)
| Q | S | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 0 |
| 2 | 0 | 1 |
| 3 | 0 | 1 |
| 4 | 0 | 1 |
| ... | ... | ... |
| 886 | 0 | 1 |
| 887 | 0 | 1 |
| 888 | 0 | 1 |
| 889 | 0 | 0 |
| 890 | 1 | 0 |
889 rows × 2 columns
# keep the indicator columns
embarked = pd.get_dummies(train['Embarked'],drop_first=True)
# append them to the frame
train = pd.concat([train,embarked],axis=1)
# the original text column is now redundant
train.drop('Embarked',axis=1,inplace=True)
train
| Survived | Pclass | Age | SibSp | Parch | Fare | male | Q | S | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | 1 | 0 | 1 |
| 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 0 | 0 | 0 |
| 2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | 0 | 0 | 1 |
| 3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 0 | 0 | 1 |
| 4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | 27.0 | 0 | 0 | 13.0000 | 1 | 0 | 1 |
| 887 | 1 | 1 | 19.0 | 0 | 0 | 30.0000 | 0 | 0 | 1 |
| 888 | 0 | 3 | 24.0 | 1 | 2 | 23.4500 | 0 | 0 | 1 |
| 889 | 1 | 1 | 26.0 | 0 | 0 | 30.0000 | 1 | 0 | 0 |
| 890 | 0 | 3 | 32.0 | 0 | 0 | 7.7500 | 1 | 1 | 0 |
889 rows × 9 columns
# also drop the 'Q' indicator column
train.drop('Q',axis=1,inplace=True)
train
| Survived | Pclass | Age | SibSp | Parch | Fare | Embarked | male | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | S | 1 |
| 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | C | 0 |
| 2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | S | 0 |
| 3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | S | 0 |
| 4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | S | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | 27.0 | 0 | 0 | 13.0000 | S | 1 |
| 887 | 1 | 1 | 19.0 | 0 | 0 | 30.0000 | S | 0 |
| 888 | 0 | 3 | 24.0 | 1 | 2 | 23.4500 | S | 0 |
| 889 | 1 | 1 | 26.0 | 0 | 0 | 30.0000 | C | 1 |
| 890 | 0 | 3 | 32.0 | 0 | 0 | 7.7500 | Q | 1 |
889 rows × 8 columns
####################################
# dates - timestamps treatment
####################################
df['timeStamp']=pd.to_datetime(df['timeStamp']) # parse string Series into datetime64
df['Hour']=df['timeStamp'].apply(lambda t : t.hour)
df['Month']=df['timeStamp'].apply(lambda t : t.month)
df['Day of Week']=df['timeStamp'].apply(lambda t : t.dayofweek) # 0=Monday .. 6=Sunday
dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df['Day of Week']=df['Day of Week'].map(dmap) # replace integer codes with short day names
# fast cross checking datas
sns.pairplot(data=df,hue='COL_NAME',palette='bwr') # placeholder/dummy parameters
####################################
# inset plot (a zoomed plot drawn inside another)
####################################
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_axes([0,0,1,1]) # main axes filling the whole figure
ax2 = fig.add_axes([.3,.3,.6,.5]) # smaller axes drawn on top = the inset
ax1.hist(train['Fare'],bins=30) # histogram of all data
ax2.hist(train[(train['Fare']<60)]['Fare'],bins=50) # inset zoom on Fares < 60 (most populated range)
(array([ 15., 0., 0., 1., 1., 25., 247., 40., 35., 9., 55.,
10., 28., 28., 5., 8., 8., 19., 5., 8., 10., 39.,
28., 16., 8., 18., 14., 5., 6., 6., 2., 2., 5.,
10., 3., 1., 0., 0., 0., 7., 0., 3., 2., 10.,
8., 0., 3., 13., 2., 1.]),
array([ 0. , 1.188, 2.376, 3.564, 4.752, 5.94 , 7.128, 8.316,
9.504, 10.692, 11.88 , 13.068, 14.256, 15.444, 16.632, 17.82 ,
19.008, 20.196, 21.384, 22.572, 23.76 , 24.948, 26.136, 27.324,
28.512, 29.7 , 30.888, 32.076, 33.264, 34.452, 35.64 , 36.828,
38.016, 39.204, 40.392, 41.58 , 42.768, 43.956, 45.144, 46.332,
47.52 , 48.708, 49.896, 51.084, 52.272, 53.46 , 54.648, 55.836,
57.024, 58.212, 59.4 ]),
<BarContainer object of 50 artists>)
loans = pd.read_csv('datas/loan_data.csv')
####################################
# 2 histograms on same plot
####################################
b=25 # number of bins
a=0.5 # alpha (transparency) so the two overlapping histograms stay visible
# variable column: switch which binary column splits the data
col='not.fully.paid'
#col='credit.policy'
plt.figure(figsize=(12,6))
loans[loans[col]==1]['fico'].hist(label=col+'=1',bins=b,alpha=a,color='red')
loans[loans[col]==0]['fico'].hist(label=col+'=0',bins=b,alpha=a,color='blue')
plt.legend()
plt.xlabel('FICO')
Text(0.5, 0, 'FICO')
loans = pd.read_csv('datas/loan_data.csv')
# Linear-model fit per facet: one subplot column per 'not.fully.paid' value,
# points colored by 'credit.policy'.
sns.lmplot(data=loans,x='fico',y='int.rate',
hue='credit.policy',col='not.fully.paid')
<seaborn.axisgrid.FacetGrid at 0x2a0603c97b0>
ad_data = pd.read_csv('datas/advertising.csv')
# Quick pairwise scatter matrix, colored by the target class.
sns.pairplot(data=ad_data,hue='Clicked on Ad',palette='bwr')
<seaborn.axisgrid.PairGrid at 0x2a061023c10>
####################################
#interactive version
####################################
import cufflinks as cf
cf.go_offline() # route .iplot() calls to offline plotly (no account needed)
train[train['Fare']<60]['Fare'].iplot(kind='hist',bins=50,color='green')
# static screenshot of the interactive output, for non-live rendering
Image(filename='imgs/13-Logistic-Regression--01-Logistic Regression with Python--iplot-hist.png')
####################################
# Multiple interactive plots
####################################
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import __version__
print(__version__)
5.11.0
#hist1 = go.Histogram(x=train['Fare'],name='All Fares',)
# NOTE(review): go.Bar() here carries no data — presumably the commented
# Histogram above was intended; the left subplot will render blank.
hist1 = go.Bar()
hist2 = go.Histogram(x=train[train['Fare']<60]['Fare'],name='Fare < 60')
fig1 = go.Figure(data=hist1)
fig2 = go.Figure(data=hist2)
# cufflinks helper: lay the two figures out side by side (1 row x 2 columns)
figs = cf.subplots([fig1, fig2],shape=(1,2))
iplot(figs)
# http://localhost:8888/notebooks/Refactored_Py_DS_ML_Bootcamp-master/13-Logistic-Regression/02-Logistic%20Regression%20Project.ipynb
#
ad_data = pd.read_csv('datas/advertising.csv')
# 2-D kernel-density joint plot; marginal_kws lightens the side distributions.
sns.jointplot(data=ad_data,x='Age',y='Daily Time Spent on Site',
kind='kde',color='red',fill=True,
marginal_kws=dict(alpha=0.1))
<seaborn.axisgrid.JointGrid at 0x1bcf8378400>
Image(filename='imgs/13-Logistic-Regression--02-Logistic Regression Project--JoinPlot.png')
# drop text/id columns that cannot feed a numeric model
train.drop(['PassengerId','Name','Sex','Ticket'],axis=1,inplace=True)
# re-encode port of embarkation as two indicator columns (Q, S)
pd.get_dummies(train['Embarked'],drop_first=True)
| Q | S | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 0 |
| 2 | 0 | 1 |
| 3 | 0 | 1 |
| 4 | 0 | 1 |
| ... | ... | ... |
| 886 | 0 | 1 |
| 887 | 0 | 1 |
| 888 | 0 | 1 |
| 889 | 0 | 0 |
| 890 | 1 | 0 |
889 rows × 2 columns
######## CONVERT LINEAR Regression to LOGISTIC Regression ########
Image('imgs/linear-to-logistic-1.JPG')
Image('imgs/linear-to-logistic-2.JPG')
######## LogisticRegression ########
#####################################
# Prepare trainings and tests data
#####################################
from sklearn.model_selection import train_test_split
y = train['Survived'] # target vector
X = train.drop('Survived',axis=1) # feature matrix = every other column
# 70/30 train/test split; fixed random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X,y,
test_size=0.30,
random_state=101)
#####################################
# Train model
#####################################
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression() # create instance of Logistic model
logmodel.fit(X=X_train,y=y_train)
#####################################
# Run predictions
#####################################
predictions = logmodel.predict(X_test)
######## K Nearest Neighbors (KNN) ########
df = pd.read_csv('datas/KNN_Project_Data')
#####################################
# Prepare trainings and tests data
#####################################
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
scaler = StandardScaler()
# learn mean/std of each feature (the target column is excluded)
scaler.fit(df.drop('TARGET CLASS',axis=1))
#scale features to standardize everything to the same scale,
#so large-scale features do not dominate small-scale ones (KNN is distance-based)
scaled_features = scaler.transform(df.drop('TARGET CLASS',axis=1))
#==> returns values centered around 0 (roughly [-1 .. +1])
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
X=scaled_features
y=df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=101)
#####################################
# Train model
#####################################
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X=X_train, y=y_train)
#####################################
# Run predictions
#####################################
predictions = knn.predict(X_test)
predictions
#####################################
# Find Best K value
#####################################
# Elbow method: refit KNN for K = 1..59, record the test-set error rate for
# each K, then plot error vs K to pick the best trade-off.
error_rate = []
iMaxLoop = 60
for i in range(1,iMaxLoop) :
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    # fraction of test samples mispredicted at this K
    error_rate.append(np.mean(pred_i!=y_test))
    print("**************************")
    print("**classification_report** WITH K=",str(i))
    print(classification_report(y_test,pred_i))
    print("**confusion_matrix** WITH K=",str(i))
    conf_matrix = confusion_matrix(y_test,pred_i)
    print(conf_matrix)
    # confusion_matrix layout: rows = true class, columns = predicted class.
    # [0,1] = "true 0 predicted 1" = false POSITIVE; [1,0] = false NEGATIVE.
    # (The original code had the two names swapped; the printed sum is the
    # same either way.)
    false_pos = conf_matrix[0,1]
    false_neg = conf_matrix[1,0]
    print("Errors :",str(false_neg+false_pos))
plt.figure(figsize=(10,6))
plt.plot(range(1,iMaxLoop),error_rate,
         color='blue',linestyle='dashed', #linestyle='--',
         marker='o',markerfacecolor='red',markersize=10)
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.title('Error rate vs K value')
**************************
**classification_report** WITH K= 1
precision recall f1-score support
0 0.73 0.72 0.72 152
1 0.71 0.72 0.72 148
accuracy 0.72 300
macro avg 0.72 0.72 0.72 300
weighted avg 0.72 0.72 0.72 300
**confusion_matrix** WITH K= 1
[[109 43]
[ 41 107]]
Errors : 84
**************************
**classification_report** WITH K= 2
precision recall f1-score support
0 0.67 0.85 0.75 152
1 0.79 0.57 0.66 148
accuracy 0.71 300
macro avg 0.73 0.71 0.70 300
weighted avg 0.73 0.71 0.70 300
**confusion_matrix** WITH K= 2
[[129 23]
[ 64 84]]
Errors : 87
**************************
**classification_report** WITH K= 3
precision recall f1-score support
0 0.80 0.77 0.78 152
1 0.77 0.80 0.78 148
accuracy 0.78 300
macro avg 0.78 0.78 0.78 300
weighted avg 0.78 0.78 0.78 300
**confusion_matrix** WITH K= 3
[[117 35]
[ 30 118]]
Errors : 65
**************************
**classification_report** WITH K= 4
precision recall f1-score support
0 0.75 0.86 0.80 152
1 0.83 0.70 0.76 148
accuracy 0.78 300
macro avg 0.79 0.78 0.78 300
weighted avg 0.79 0.78 0.78 300
**confusion_matrix** WITH K= 4
[[130 22]
[ 44 104]]
Errors : 66
**************************
**classification_report** WITH K= 5
precision recall f1-score support
0 0.79 0.80 0.80 152
1 0.79 0.78 0.79 148
accuracy 0.79 300
macro avg 0.79 0.79 0.79 300
weighted avg 0.79 0.79 0.79 300
**confusion_matrix** WITH K= 5
[[122 30]
[ 32 116]]
Errors : 62
**************************
**classification_report** WITH K= 6
precision recall f1-score support
0 0.76 0.86 0.80 152
1 0.83 0.72 0.77 148
accuracy 0.79 300
macro avg 0.79 0.79 0.79 300
weighted avg 0.79 0.79 0.79 300
**confusion_matrix** WITH K= 6
[[130 22]
[ 41 107]]
Errors : 63
**************************
**classification_report** WITH K= 7
precision recall f1-score support
0 0.83 0.81 0.82 152
1 0.81 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 7
[[123 29]
[ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 8
precision recall f1-score support
0 0.80 0.84 0.82 152
1 0.83 0.78 0.80 148
accuracy 0.81 300
macro avg 0.81 0.81 0.81 300
weighted avg 0.81 0.81 0.81 300
**confusion_matrix** WITH K= 8
[[128 24]
[ 33 115]]
Errors : 57
**************************
**classification_report** WITH K= 9
precision recall f1-score support
0 0.81 0.81 0.81 152
1 0.81 0.81 0.81 148
accuracy 0.81 300
macro avg 0.81 0.81 0.81 300
weighted avg 0.81 0.81 0.81 300
**confusion_matrix** WITH K= 9
[[123 29]
[ 28 120]]
Errors : 57
**************************
**classification_report** WITH K= 10
precision recall f1-score support
0 0.81 0.86 0.83 152
1 0.84 0.79 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 10
[[130 22]
[ 31 117]]
Errors : 53
**************************
**classification_report** WITH K= 11
precision recall f1-score support
0 0.83 0.81 0.82 152
1 0.81 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 11
[[123 29]
[ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 12
precision recall f1-score support
0 0.81 0.84 0.82 152
1 0.83 0.79 0.81 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 12
[[128 24]
[ 31 117]]
Errors : 55
**************************
**classification_report** WITH K= 13
precision recall f1-score support
0 0.83 0.81 0.82 152
1 0.81 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 13
[[123 29]
[ 26 122]]
Errors : 55
**************************
**classification_report** WITH K= 14
precision recall f1-score support
0 0.81 0.84 0.82 152
1 0.83 0.80 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 14
[[127 25]
[ 29 119]]
Errors : 54
**************************
**classification_report** WITH K= 15
precision recall f1-score support
0 0.83 0.81 0.82 152
1 0.81 0.83 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 15
[[123 29]
[ 25 123]]
Errors : 54
**************************
**classification_report** WITH K= 16
precision recall f1-score support
0 0.81 0.84 0.82 152
1 0.83 0.80 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 16
[[127 25]
[ 29 119]]
Errors : 54
**************************
**classification_report** WITH K= 17
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 17
[[124 28]
[ 23 125]]
Errors : 51
**************************
**classification_report** WITH K= 18
precision recall f1-score support
0 0.82 0.84 0.83 152
1 0.83 0.82 0.82 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 18
[[127 25]
[ 27 121]]
Errors : 52
**************************
**classification_report** WITH K= 19
precision recall f1-score support
0 0.83 0.82 0.83 152
1 0.82 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 19
[[125 27]
[ 26 122]]
Errors : 53
**************************
**classification_report** WITH K= 20
precision recall f1-score support
0 0.82 0.82 0.82 152
1 0.82 0.81 0.81 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 20
[[125 27]
[ 28 120]]
Errors : 55
**************************
**classification_report** WITH K= 21
precision recall f1-score support
0 0.84 0.81 0.82 152
1 0.81 0.84 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 21
[[123 29]
[ 24 124]]
Errors : 53
**************************
**classification_report** WITH K= 22
precision recall f1-score support
0 0.82 0.82 0.82 152
1 0.81 0.82 0.81 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 22
[[124 28]
[ 27 121]]
Errors : 55
**************************
**classification_report** WITH K= 23
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 23
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 24
precision recall f1-score support
0 0.82 0.82 0.82 152
1 0.82 0.82 0.82 148
accuracy 0.82 300
macro avg 0.82 0.82 0.82 300
weighted avg 0.82 0.82 0.82 300
**confusion_matrix** WITH K= 24
[[125 27]
[ 27 121]]
Errors : 54
**************************
**classification_report** WITH K= 25
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 25
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 26
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 26
[[125 27]
[ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 27
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 27
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 28
precision recall f1-score support
0 0.83 0.82 0.83 152
1 0.82 0.83 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 28
[[125 27]
[ 25 123]]
Errors : 52
**************************
**classification_report** WITH K= 29
precision recall f1-score support
0 0.85 0.81 0.83 152
1 0.81 0.86 0.84 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 29
[[123 29]
[ 21 127]]
Errors : 50
**************************
**classification_report** WITH K= 30
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 30
[[124 28]
[ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 31
precision recall f1-score support
0 0.87 0.81 0.84 152
1 0.82 0.87 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 31
[[123 29]
[ 19 129]]
Errors : 48
**************************
**classification_report** WITH K= 32
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 32
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 33
precision recall f1-score support
0 0.85 0.80 0.82 152
1 0.81 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 33
[[122 30]
[ 22 126]]
Errors : 52
**************************
**classification_report** WITH K= 34
precision recall f1-score support
0 0.84 0.81 0.83 152
1 0.81 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 34
[[123 29]
[ 23 125]]
Errors : 52
**************************
**classification_report** WITH K= 35
precision recall f1-score support
0 0.85 0.81 0.83 152
1 0.81 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 35
[[123 29]
[ 22 126]]
Errors : 51
**************************
**classification_report** WITH K= 36
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 36
[[125 27]
[ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 37
precision recall f1-score support
0 0.86 0.82 0.84 152
1 0.82 0.86 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 37
[[125 27]
[ 21 127]]
Errors : 48
**************************
**classification_report** WITH K= 38
precision recall f1-score support
0 0.85 0.83 0.84 152
1 0.83 0.84 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 38
[[126 26]
[ 23 125]]
Errors : 49
**************************
**classification_report** WITH K= 39
precision recall f1-score support
0 0.86 0.82 0.84 152
1 0.82 0.86 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 39
[[125 27]
[ 21 127]]
Errors : 48
**************************
**classification_report** WITH K= 40
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 40
[[125 27]
[ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 41
precision recall f1-score support
0 0.85 0.81 0.83 152
1 0.81 0.86 0.84 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 41
[[123 29]
[ 21 127]]
Errors : 50
**************************
**classification_report** WITH K= 42
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 42
[[125 27]
[ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 43
precision recall f1-score support
0 0.86 0.82 0.84 152
1 0.82 0.86 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 43
[[124 28]
[ 21 127]]
Errors : 49
**************************
**classification_report** WITH K= 44
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 44
[[124 28]
[ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 45
precision recall f1-score support
0 0.85 0.82 0.83 152
1 0.82 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 45
[[124 28]
[ 22 126]]
Errors : 50
**************************
**classification_report** WITH K= 46
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 46
[[125 27]
[ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 47
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 47
[[124 28]
[ 23 125]]
Errors : 51
**************************
**classification_report** WITH K= 48
precision recall f1-score support
0 0.84 0.83 0.83 152
1 0.83 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 48
[[126 26]
[ 24 124]]
Errors : 50
**************************
**classification_report** WITH K= 49
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 49
[[124 28]
[ 24 124]]
Errors : 52
**************************
**classification_report** WITH K= 50
precision recall f1-score support
0 0.84 0.83 0.83 152
1 0.83 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 50
[[126 26]
[ 24 124]]
Errors : 50
**************************
**classification_report** WITH K= 51
precision recall f1-score support
0 0.85 0.83 0.84 152
1 0.83 0.84 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 51
[[126 26]
[ 23 125]]
Errors : 49
**************************
**classification_report** WITH K= 52
precision recall f1-score support
0 0.84 0.84 0.84 152
1 0.83 0.84 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 52
[[127 25]
[ 24 124]]
Errors : 49
**************************
**classification_report** WITH K= 53
precision recall f1-score support
0 0.85 0.83 0.84 152
1 0.83 0.85 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 53
[[126 26]
[ 22 126]]
Errors : 48
**************************
**classification_report** WITH K= 54
precision recall f1-score support
0 0.84 0.84 0.84 152
1 0.84 0.84 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 54
[[128 24]
[ 24 124]]
Errors : 48
**************************
**classification_report** WITH K= 55
precision recall f1-score support
0 0.85 0.82 0.84 152
1 0.82 0.85 0.84 148
accuracy 0.84 300
macro avg 0.84 0.84 0.84 300
weighted avg 0.84 0.84 0.84 300
**confusion_matrix** WITH K= 55
[[125 27]
[ 22 126]]
Errors : 49
**************************
**classification_report** WITH K= 56
precision recall f1-score support
0 0.83 0.82 0.83 152
1 0.82 0.83 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 56
[[125 27]
[ 25 123]]
Errors : 52
**************************
**classification_report** WITH K= 57
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 57
[[125 27]
[ 23 125]]
Errors : 50
**************************
**classification_report** WITH K= 58
precision recall f1-score support
0 0.84 0.82 0.83 152
1 0.82 0.84 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 58
[[125 27]
[ 24 124]]
Errors : 51
**************************
**classification_report** WITH K= 59
precision recall f1-score support
0 0.85 0.81 0.83 152
1 0.81 0.85 0.83 148
accuracy 0.83 300
macro avg 0.83 0.83 0.83 300
weighted avg 0.83 0.83 0.83 300
**confusion_matrix** WITH K= 59
[[123 29]
[ 22 126]]
Errors : 51
Text(0.5, 1.0, 'Error rate vs K value')
######## Decision Trees and Random Forests ########
# Load the kyphosis dataset (target column 'Kyphosis' plus numeric features).
df = pd.read_csv('datas/kyphosis.csv')
#####################################
from sklearn.model_selection import train_test_split
# Features = every column except the target; target = 'Kyphosis' (absent/present).
X = df.drop('Kyphosis', axis=1)
y = df['Kyphosis']
# Hold out 30% of the rows for evaluation (no random_state: split differs per run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
from sklearn.tree import DecisionTreeClassifier
# Fit a single (default, unpruned) decision tree and predict on the hold-out set.
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision/recall/F1 followed by the raw confusion matrix.
print("**classification_report**")
print(classification_report(y_test, predictions))
print("**confusion_matrix**")
print(confusion_matrix(y_test, predictions))
**classification_report**
precision recall f1-score support
absent 0.88 0.75 0.81 20
present 0.38 0.60 0.46 5
accuracy 0.72 25
macro avg 0.63 0.68 0.64 25
weighted avg 0.78 0.72 0.74 25
**confusion_matrix**
[[15 5]
[ 2 3]]
#####################################
from sklearn.ensemble import RandomForestClassifier
# Ensemble of 100 trees; bagging + feature subsampling lowers the variance
# of a single decision tree on the same train/test split as above.
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
# Same evaluation as the single tree, for direct comparison.
print("**classification_report**")
print(classification_report(y_test, rfc_pred))
print("**confusion_matrix**")
print(confusion_matrix(y_test, rfc_pred))
**classification_report**
precision recall f1-score support
absent 0.86 0.95 0.90 20
present 0.67 0.40 0.50 5
accuracy 0.84 25
macro avg 0.77 0.68 0.70 25
weighted avg 0.82 0.84 0.82 25
**confusion_matrix**
[[19 1]
[ 3 2]]
######## Support Vector Machines (SVM) ########
#####################################
#####################################
# Check predictions results
#####################################
#####################################
Image(filename='imgs/confusionMatrix.JPG')
Accuracy = (TP + TN) / total = 150 / 165 = 0.91
Accuracy is useful when target classes are well balanced, BUT not a good choice with **unbalanced** classes!
Misclassification (error) rate = (FP + FN) / total = 15 / 165 = 0.09
Recall = TP / (TP + FN) = 100 / 105 = 0.95
Precision = TP / (TP + FP) = 100 / 110 = 0.91
While recall expresses the ability to find all relevant instances in a dataset,
precision expresses the proportion of the data points our model labels as relevant that actually are relevant.
Image('imgs/confusionMatrixFormulas.png')
Here are three common evaluation metrics for regression problems:
Mean Absolute Error (MAE) is the mean of the absolute value of the errors:
$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$(somme des différences (en valeur absolue) entre valeur prédite et valeur réelle / (nombre de prédictions)
==> large errors not punished)
Mean Squared Error (MSE) is the mean of the squared errors:
$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$(somme des {différences entre valeur prédite et valeur réelle}^2 / (nombre de prédictions)
==> unité -> unité^2)
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:
$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$RacineCarré[somme des {différences* entre valeur prédite et valeur réelle}^2 / (nombre de prédictions)]
Comparing these metrics:
All of these are loss functions, because we want to minimize them.
from sklearn.metrics import classification_report, confusion_matrix
# Evaluate the classifier's predictions against the held-out labels
# (y_test / predictions come from the earlier train/fit/predict cells).
print("**classification_report**")
print(classification_report(y_test, predictions))
print("**confusion_matrix**")
print(confusion_matrix(y_test, predictions))
**classification_report**
precision recall f1-score support
0 0.83 0.90 0.86 163
1 0.82 0.71 0.76 104
accuracy 0.83 267
macro avg 0.83 0.81 0.81 267
weighted avg 0.83 0.83 0.83 267
**confusion_matrix**
[[147 16]
[ 30 74]]